Introduction

In 2016, 1 in 5 Americans reported taking four or more prescription drugs (“Public Opinion on Prescription…”, 2016). Also, high drug prices have become an increasing concern in the United States (Hancock, 2016 ; “Why Prescription Drugs…”, 2017). This project aims at exploring some of the factors associated with US prescription drug prices including disease research funding, drug need (prevalence), sales, and advertising. Additionally, we seek to understand the types of drugs that receive patents/exclusivity.

The Data

The Variables

The variables that we retained are:

#project website. can you see it? https://josiealford14.github.io/therealcostofdrugs/
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
library(dplyr)
library(rvest)
library(httr)
library(janitor)
library(stringr)
library(forcats)
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.2
library(viridis)
## Warning: package 'viridis' was built under R version 3.4.2
library(broom)
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.2

Dataset 1: Drug Prices

# # API code
# url = "https://data.medicaid.gov/resource/tau9-gfwr.csv"
# drug_price =
#   GET(url, query = list("$limit" = 9999999)) %>%
#   content("parsed") %>%
#   clean_names()

# # CSV code
drug_price =
  read_csv("../data/drug_price.csv") %>%
  clean_names()
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   as_of_date = col_date(format = ""),
##   classification_for_rate_setting = col_character(),
##   corresponding_generic_drug_effective_date = col_character(),
##   corresponding_generic_drug_nadac_per_unit = col_character(),
##   effective_date = col_date(format = ""),
##   explanation_code = col_character(),
##   nadac_per_unit = col_double(),
##   ndc = col_character(),
##   ndc_description = col_character(),
##   otc = col_character(),
##   pharmacy_type_indicator = col_character(),
##   pricing_unit = col_character()
## )
drug_price = 
  drug_price %>%
  rename(trade_name = ndc_description,
         drug_type = classification_for_rate_setting,
         unit_price = nadac_per_unit) %>%
  mutate(trade_name = str_replace(trade_name, "[0-9]", "_"),
         trade_name = str_to_title(trade_name),
         otc = recode(otc, "Y" = "yes", "N" = "no"),
         otc = str_to_title(otc),
         otc = as.factor(otc)) %>% 
  separate(trade_name, into = c("trade_name", "remove"), sep = "_") %>% 
  separate(effective_date, into = c("year", "month", "date")) %>%
  select(-c(date, x1, remove)) %>%
  mutate(trade_name = trimws(trade_name, which = c("both"))) %>% 
  separate(trade_name, into = c("trade_name", "remove"), sep = " ") %>% 
  filter(is.na(remove)) %>%
  select(-remove) %>% 
  filter(pricing_unit == "GM" | pricing_unit == "EA") %>% 
  select(trade_name, everything()) 
## Warning: Too few values at 308308 locations: 1, 2, 7, 17, 18, 19, 30, 45,
## 48, 56, 59, 60, 64, 66, 68, 85, 86, 87, 92, 107, ...
## Warning: Too many values at 592722 locations: 5, 17, 18, 48, 68, 112, 128,
## 136, 138, 173, 196, 215, 222, 231, 235, 241, 265, 273, 279, 294, ...
## Warning: Too few values at 2730099 locations: 1, 3, 4, 6, 8, 9, 10, 11, 12,
## 13, 15, 16, 20, 21, 23, 24, 25, 26, 27, 28, ...
drug_price %>%
  filter(trade_name == "Abacavir") %>%
  ggplot(aes(x = year, y = unit_price)) + geom_violin(aes(fill = year), color = "blue", alpha = .5)

new_drug_price = drug_price %>% 
  group_by(trade_name, drug_type, otc, pricing_unit, year) %>% 
  summarize(avg_price = mean(unit_price, na.rm = TRUE))

new_drug_price
## # A tibble: 6,023 x 6
## # Groups:   trade_name, drug_type, otc, pricing_unit [?]
##    trade_name drug_type    otc pricing_unit  year avg_price
##         <chr>     <chr> <fctr>        <chr> <chr>     <dbl>
##  1                    G    Yes           EA  2013 0.2434375
##  2                    G    Yes           EA  2014 0.2430778
##  3                    G    Yes           EA  2015 0.2556137
##  4                    G    Yes           EA  2016 0.2538263
##  5                    G    Yes           EA  2017 0.2339103
##  6                    G    Yes           GM  2016 0.3077704
##  7                    G    Yes           GM  2017 0.3087167
##  8   Abacavir         G     No           EA  2013 5.3170850
##  9   Abacavir         G     No           EA  2014 4.7696039
## 10   Abacavir         G     No           EA  2015 3.6822746
## # ... with 6,013 more rows

Dataset 2: Funding for Research

url = "https://report.nih.gov/categorical_spending.aspx"
NIH_xml = read_html(url)
NIH_funding = (NIH_xml %>% html_nodes(css = "table"))[[3]] %>% html_table() %>% as_tibble()
NIH_funding = NIH_funding %>%
  clean_names() %>%
  mutate(US_mortality_2015 = x2015_us_mortality_19)

NIH_funding$x2015_us_prevalence_standard_error_19  = recode(NIH_funding$x2015_us_prevalence_standard_error_19 , "-" = "NA")
NIH_funding$US_mortality_2015 = recode(NIH_funding$US_mortality_2015, "-" = "NA")

NIH_funding = NIH_funding %>%
  separate(x2015_us_prevalence_standard_error_19, into = c("prevalence", "remove"), sep = "\\%") %>%
  select(-x2015_us_mortality_19, -remove)
## Warning: Too many values at 36 locations: 7, 19, 21, 24, 25, 35, 37, 40,
## 46, 48, 54, 59, 74, 80, 81, 89, 93, 114, 115, 117, ...
## Warning: Too few values at 246 locations: 1, 2, 3, 4, 5, 6, 8, 9, 10, 11,
## 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, ...
NIH_funding$fy_2013actual = recode(NIH_funding$fy_2013actual, "+" = "NNA")
NIH_funding$fy_2014actual = recode(NIH_funding$fy_2014actual, "+" = "NNA")
NIH_funding$fy_2015actual = recode(NIH_funding$fy_2015actual, "+" = "NNA")
NIH_funding$fy_2016actual = recode(NIH_funding$fy_2016actual, "+" = "NNA")

NIH_funding = NIH_funding %>%
  mutate(fy_2013 = as.numeric(substring(fy_2013actual, 2))) %>%
  mutate(fy_2014 = as.numeric(substring(fy_2014actual, 2))) %>%
  mutate(fy_2015 = as.numeric(substring(fy_2015actual, 2))) %>%
  mutate(fy_2016 = as.numeric(substring(fy_2016actual, 2))) %>%
  mutate(fy_2017 = as.numeric(substring(fy_2017estimated_enacted, 2))) %>%
  mutate(fy_2018estimated = as.numeric(substring(fy_2018estimated, 2))) 
## Warning in evalq(as.numeric(substring(fy_2013actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2014actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2015actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2016actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2017estimated_enacted, 2)),
## <environment>): NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2018estimated, 2)),
## <environment>): NAs introduced by coercion
NIH_funding$prevalence = as.numeric(NIH_funding$prevalence)
## Warning: NAs introduced by coercion
NIH_funding$US_mortality_2015 = as.numeric(NIH_funding$US_mortality_2015)
## Warning: NAs introduced by coercion
NIH_funding = NIH_funding %>%
  select(-c(fy_2013actual, fy_2014actual, fy_2015actual, fy_2016actual, fy_2017estimated_enacted))

final_NIH_funding = NIH_funding %>%
  rename(disease = research_disease_areas_dollars_in_millions_and_rounded, 
         '2013' = fy_2013,
         '2014' = fy_2014,
         '2015' = fy_2015,
         '2016' = fy_2016,
         '2017' = fy_2017,
         '2018' = fy_2018estimated) %>%
  gather(key = "year", value = funding, c("2013", "2014", "2015", "2016", "2017", "2018")) %>% 
  mutate(disease = str_replace(disease, "[0-9]", "_"),
         disease = str_replace(disease, "\\-", ""),
         disease = str_replace(disease, "ALS", "Amyotrophic Lateral Sclerosis"),
         disease = str_replace(disease, "Sleep Research", "Sleep Disorders"),
         disease = str_replace(disease, "Smoking and Health", "Smoking Cessation")) %>% 
  separate(disease, into = c("disease", "remove"), sep = "\\_") %>% 
  select(-remove) %>% 
  separate(disease, into = c("disease", "remove"), sep = "\\(") %>% 
  select(-remove)
## Warning: Too few values at 1524 locations: 1, 2, 3, 4, 5, 7, 8, 9, 12, 13,
## 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, ...
## Warning: Too few values at 1566 locations: 1, 2, 3, 4, 5, 6, 8, 9, 12, 13,
## 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, ...
rm(NIH_funding)

final_NIH_funding
## # A tibble: 1,692 x 5
##                                                                 disease
##  *                                                                <chr>
##  1                                        Acquired Cognitive Impairment
##  2                                  Acute Respiratory Distress Syndrome
##  3                                           Adolescent Sexual Activity
##  4                                                Agent Orange & Dioxin
##  5                                                                Aging
##  6                                  Alcoholism, Alcohol Use and Health 
##  7                                                   Allergic Rhinitis 
##  8                                        Amyotrophic Lateral Sclerosis
##  9                                                  Alzheimer's Disease
## 10 Alzheimer's Disease including Alzheimer's Disease Related Dementias 
## # ... with 1,682 more rows, and 4 more variables: prevalence <dbl>,
## #   US_mortality_2015 <dbl>, year <chr>, funding <dbl>

Dataset 3: Patents

patents = read_csv("./data/patent.csv")
## Parsed with column specification:
## cols(
##   appl_type = col_character(),
##   appl_no = col_integer(),
##   product_no = col_integer(),
##   patent_no = col_character(),
##   patexpstr = col_character(),
##   patexp = col_character(),
##   drugsub = col_character(),
##   drugprod = col_character(),
##   patuse = col_character(),
##   delist = col_character()
## )
patents = patents %>% 
  clean_names() %>%
  rename(drug_type = appl_type) %>% 
  mutate(drug_type = recode(drug_type, "N" = "B")) %>%
  separate(patexp, into = c("month_pat_expires", "day_pat_expires", "year_pat_expires"), sep = "/") %>%
  separate(patent_no, into = c("patent_num", "pediatric_exclusivity_granted"), sep = "\\*")
## Warning: Too few values at 12167 locations: 1, 2, 3, 5, 7, 9, 10, 11, 12,
## 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, ...
patents$pediatric_exclusivity_granted[is.na(patents$pediatric_exclusivity_granted)] = "no"
patents$drugsub[is.na(patents$drugsub)] = "no"
patents$drugprod[is.na(patents$drugprod)] = "no"
patents$delist[is.na(patents$delist)] = "no"

patents = patents %>%
  mutate(pediatric_exclusivity_granted = factor(pediatric_exclusivity_granted, levels = c("no", "PED"), labels = c("No", "Yes"))) %>%
  mutate(drug_substance_flag = factor(drugsub, levels = c("no", "Y"), labels = c("no", "yes"))) %>%
  mutate(drug_prod_flag = factor(drugprod, levels = c("no", "Y"), labels = c("no", "yes"))) %>%
  mutate(patent_delisted = factor(delist, levels = c("no", "Y"), labels = c("no", "yes"))) %>%
  select(-drugsub, -drugprod, -patexpstr) %>%
  rename(pat_use_code = patuse)
patents
## # A tibble: 13,858 x 13
##    drug_type appl_no product_no patent_num pediatric_exclusivity_granted
##        <chr>   <int>      <int>      <chr>                        <fctr>
##  1         B   22425          1    5223510                            No
##  2         B   22425          1    5223510                            No
##  3         B   22307          1    5288726                            No
##  4         B   22307          1    5288726                           Yes
##  5         B   21462          1    5344932                            No
##  6         B   21462          1    5344932                           Yes
##  7         B   22088          1    5362718                            No
##  8         B   22088          1    5362718                           Yes
##  9         B   21060          1    5364842                            No
## 10         B   21060          1    5364842                            No
## # ... with 13,848 more rows, and 8 more variables:
## #   month_pat_expires <chr>, day_pat_expires <chr>,
## #   year_pat_expires <chr>, pat_use_code <chr>, delist <chr>,
## #   drug_substance_flag <fctr>, drug_prod_flag <fctr>,
## #   patent_delisted <fctr>

Dataset 4: Products

*The Products dataset from the FDA Orange Book

*Type: The group or category of approved drugs. RX (prescription), OTC (over the counter), or DISCN (discontinued).

*Applicant Full Name: The full name of the firm holding legal responsibility for the new drug application.

products = read_csv("./data/products.csv")
## Parsed with column specification:
## cols(
##   ingredient = col_character(),
##   dfroute = col_character(),
##   doseform = col_character(),
##   route = col_character(),
##   trade_name = col_character(),
##   applicant = col_character(),
##   strength = col_character(),
##   appl_type = col_character(),
##   appl_no = col_integer(),
##   product_no = col_integer(),
##   tecodestr = col_character(),
##   tecode = col_integer(),
##   appdatestr = col_character(),
##   appdate = col_character(),
##   rld = col_character(),
##   rs = col_character(),
##   type = col_character(),
##   appfull = col_character()
## )
products = products %>%
  rename(drug_type = appl_type,
         otc = type) %>% 
  mutate(drug_type = recode(drug_type, "A" = "G", "N" = "B"),
         otc = recode(otc, "OTC" = "yes", "RX" = "no", "DISCN" = "no")) %>%
  separate(appdate, into = c("approval_month", "approval_day", "approval_year"), sep = "/") %>%
  #separate(doseform, into = c("form", "release_type"), sep = ",")
  mutate(otc = factor(otc)) %>%
  mutate(ref_listed_drug = factor(rld)) %>%
  mutate(ref_std = factor(rs)) %>%
  select(-appdatestr, -rld, -rs, -dfroute)
products
## # A tibble: 34,209 x 18
##                                           ingredient doseform route
##                                                <chr>    <chr> <chr>
##  1                                  ABACAVIR SULFATE SOLUTION  ORAL
##  2                                  ABACAVIR SULFATE SOLUTION  ORAL
##  3                                  ABACAVIR SULFATE   TABLET  ORAL
##  4                                  ABACAVIR SULFATE   TABLET  ORAL
##  5                                  ABACAVIR SULFATE   TABLET  ORAL
##  6                                  ABACAVIR SULFATE   TABLET  ORAL
##  7                                  ABACAVIR SULFATE   TABLET  ORAL
##  8                                  ABACAVIR SULFATE   TABLET  ORAL
##  9 ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE   TABLET  ORAL
## 10                      ABACAVIR SULFATE; LAMIVUDINE   TABLET  ORAL
## # ... with 34,199 more rows, and 15 more variables: trade_name <chr>,
## #   applicant <chr>, strength <chr>, drug_type <chr>, appl_no <int>,
## #   product_no <int>, tecodestr <chr>, tecode <int>, approval_month <chr>,
## #   approval_day <chr>, approval_year <chr>, otc <fctr>, appfull <chr>,
## #   ref_listed_drug <fctr>, ref_std <fctr>

Dataset 5: Exclusivity

We load and tidy the exclusivity dataset

exclusivity = read_csv("./data/exclusivity.csv") %>%
  clean_names() %>%
  rename(drug_type = appl_type) %>% 
  mutate(drug_type = recode(drug_type, "A" = "G", "N" = "B")) %>% 
  separate(excldate, into = c("month_excl_expires", "date_excl_expires", "year_excl_expires"))
## Parsed with column specification:
## cols(
##   appl_type = col_character(),
##   appl_no = col_integer(),
##   product_no = col_integer(),
##   exclcode = col_character(),
##   excldatestr = col_character(),
##   excldate = col_character()
## )
exclusivity
## # A tibble: 1,806 x 8
##    drug_type appl_no product_no exclcode  excldatestr month_excl_expires
##  *     <chr>   <int>      <int>    <chr>        <chr>              <chr>
##  1         G   65488          1       PC Mar 26, 2018                  3
##  2         G   65488          2       PC Mar 26, 2018                  3
##  3         G   78276          1       PC Apr 24, 2017                  4
##  4         G   78276          2       PC Apr 24, 2017                  4
##  5         G   78276          3       PC Apr 24, 2017                  4
##  6         G   78340          1       PC Jul 30, 2016                  7
##  7         G   78340          2       PC Jul 30, 2016                  7
##  8         G   78560          1       PC Jun 10, 2017                  6
##  9         G   78827          1       PC Apr 24, 2017                  4
## 10         G   78827          2       PC Apr 24, 2017                  4
## # ... with 1,796 more rows, and 2 more variables: date_excl_expires <chr>,
## #   year_excl_expires <chr>

The tidied exclusivity dataset has 1806 observations and 8 variables:

Dataset 6: Ingredient

We load the ingredient dataset.

ingredient = read_csv("./data/ingredient.csv") %>%
  clean_names() %>%
  select(singnum, singredient, everything())
## Parsed with column specification:
## cols(
##   ingredient = col_character(),
##   singnum = col_integer(),
##   singredient = col_character()
## )
ingredient
## # A tibble: 3,371 x 3
##    singnum         singredient
##      <int>               <chr>
##  1       1    ABACAVIR SULFATE
##  2       1    ABACAVIR SULFATE
##  3       2 DOLUTEGRAVIR SODIUM
##  4       3          LAMIVUDINE
##  5       1    ABACAVIR SULFATE
##  6       2          LAMIVUDINE
##  7       1    ABACAVIR SULFATE
##  8       2          LAMIVUDINE
##  9       3          ZIDOVUDINE
## 10       1       ABALOPARATIDE
## # ... with 3,361 more rows, and 1 more variables: ingredient <chr>

The tidied ingredient dataset has 3371 observations and 3 variables:

Dataset 7: Drug by Condition

Scraping data on drugs per condition from this website

urls = paste("https://www.centerwatch.com/drug-information/fda-approved-drugs/medical-conditions/", toupper(letters), sep = "")

get_drug_disease = function(url) {
  
  disease_data = read_html(url) %>% 
  html_nodes(css = ".ToggleDrugCategory") %>%
  html_text() %>% 
  as_tibble() %>% 
  mutate(key = value) %>% 
  rename(disease = value)
  
  drug_data = read_html(url) %>% 
  html_nodes(css = ".CategoryListSection a:nth-child(1)") %>%
  html_text() %>% 
  as_tibble() %>% 
  mutate(key = value) %>% 
  rename(drug = value)
  
  drug_disease_data = read_html(url) %>% 
  html_nodes(css = "#MainColumn a:nth-child(1)") %>%
  html_text() %>% 
  as_tibble() %>% 
  mutate(key = value) %>% 
  rename(drug_disease = value)
  
  drug_disease_joined = left_join(drug_disease_data, drug_data, disease_data, by = "key") %>% 
  left_join(., disease_data, by = "key") %>%
  fill(disease) %>%
  select(-c(drug,drug_disease))
  
  toremove = drug_disease_joined %>% distinct(disease) %>% pull()
  
  drug_disease_joined = drug_disease_joined %>% 
  filter(!key %in% toremove) %>% 
  rename(drug = key) %>% 
  .[-c(1:29),]
  
  drug_disease_joined
}

drug_disease_data = map_df(urls, get_drug_disease) %>% 
  mutate(disease = str_to_title(disease),
         drug = str_to_title(drug)) %>% 
  rename(trade_name = drug) %>%
  mutate(trade_name = trimws(trade_name, which = c("both"))) %>% 
  separate(trade_name, into = c("trade_name", "remove"), sep = " ") %>% 
  select(-remove)
## Warning: Too many values at 961 locations: 2, 3, 4, 5, 6, 8, 9, 10, 11, 12,
## 13, 14, 16, 18, 19, 21, 23, 24, 25, 26, ...
## Warning: Too few values at 139 locations: 7, 64, 68, 76, 109, 122, 132,
## 144, 145, 161, 162, 173, 174, 188, 197, 207, 221, 223, 228, 231, ...
drug_disease_data
## # A tibble: 1,883 x 2
##     trade_name disease
##  *       <chr>   <chr>
##  1       Avita    Acne
##  2  Benzamycin    Acne
##  3 Clindamycin    Acne
##  4 Clindamycin    Acne
##  5    Differin    Acne
##  6   Estrostep    Acne
##  7     Finevin    Acne
##  8      Klaron    Acne
##  9       Ortho    Acne
## 10     Retin-A    Acne
## # ... with 1,873 more rows

The drug_disease_data contains 1883 observations and 2 variables:

Combine Datasets into one

#A: join datasets by ingredient for ingredient and products (left join)
#products = longer dataset + ingredient
product_ingredient = left_join(products, ingredient, by = "ingredient")
product_ingredient
## # A tibble: 40,443 x 20
##                                           ingredient doseform route
##                                                <chr>    <chr> <chr>
##  1                                  ABACAVIR SULFATE SOLUTION  ORAL
##  2                                  ABACAVIR SULFATE SOLUTION  ORAL
##  3                                  ABACAVIR SULFATE   TABLET  ORAL
##  4                                  ABACAVIR SULFATE   TABLET  ORAL
##  5                                  ABACAVIR SULFATE   TABLET  ORAL
##  6                                  ABACAVIR SULFATE   TABLET  ORAL
##  7                                  ABACAVIR SULFATE   TABLET  ORAL
##  8                                  ABACAVIR SULFATE   TABLET  ORAL
##  9 ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE   TABLET  ORAL
## 10 ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE   TABLET  ORAL
## # ... with 40,433 more rows, and 17 more variables: trade_name <chr>,
## #   applicant <chr>, strength <chr>, drug_type <chr>, appl_no <int>,
## #   product_no <int>, tecodestr <chr>, tecode <int>, approval_month <chr>,
## #   approval_day <chr>, approval_year <chr>, otc <fctr>, appfull <chr>,
## #   ref_listed_drug <fctr>, ref_std <fctr>, singnum <int>,
## #   singredient <chr>
#B: exclusivity and patents left join (more~less)
patent_exclusivity = left_join(patents, exclusivity, by = c("appl_no", "product_no", "drug_type"))
patent_exclusivity
## # A tibble: 22,379 x 18
##    drug_type appl_no product_no patent_num pediatric_exclusivity_granted
##        <chr>   <int>      <int>      <chr>                        <fctr>
##  1         B   22425          1    5223510                            No
##  2         B   22425          1    5223510                            No
##  3         B   22307          1    5288726                            No
##  4         B   22307          1    5288726                            No
##  5         B   22307          1    5288726                           Yes
##  6         B   22307          1    5288726                           Yes
##  7         B   21462          1    5344932                            No
##  8         B   21462          1    5344932                           Yes
##  9         B   22088          1    5362718                            No
## 10         B   22088          1    5362718                           Yes
## # ... with 22,369 more rows, and 13 more variables:
## #   month_pat_expires <chr>, day_pat_expires <chr>,
## #   year_pat_expires <chr>, pat_use_code <chr>, delist <chr>,
## #   drug_substance_flag <fctr>, drug_prod_flag <fctr>,
## #   patent_delisted <fctr>, exclcode <chr>, excldatestr <chr>,
## #   month_excl_expires <chr>, date_excl_expires <chr>,
## #   year_excl_expires <chr>
#C: join A and B left join: "Orange Book." Thursday: meet to eliminate variables (inner join: only commonalities are preserved) #eliminate drugs that do not match
orange_book = left_join(product_ingredient, patent_exclusivity, by = c("appl_no", "product_no", "drug_type"))

orange_book = orange_book %>%
  mutate(ingredient = str_to_title(ingredient),
         doseform = str_to_title(doseform),
         route = str_to_title(route),
         trade_name = str_to_title(trade_name),
         applicant = str_to_title(applicant),
         otc = str_to_title(otc),
         appfull = str_to_title(appfull),
         singredient = str_to_title(singredient),
         strength = str_to_title(strength))
orange_book
## # A tibble: 64,423 x 35
##          ingredient doseform route       trade_name            applicant
##               <chr>    <chr> <chr>            <chr>                <chr>
##  1 Abacavir Sulfate Solution  Oral Abacavir Sulfate  Hetero Labs Ltd Iii
##  2 Abacavir Sulfate Solution  Oral           Ziagen        Viiv Hlthcare
##  3 Abacavir Sulfate Solution  Oral           Ziagen        Viiv Hlthcare
##  4 Abacavir Sulfate Solution  Oral           Ziagen        Viiv Hlthcare
##  5 Abacavir Sulfate Solution  Oral           Ziagen        Viiv Hlthcare
##  6 Abacavir Sulfate   Tablet  Oral Abacavir Sulfate           Apotex Inc
##  7 Abacavir Sulfate   Tablet  Oral Abacavir Sulfate Aurobindo Pharma Ltd
##  8 Abacavir Sulfate   Tablet  Oral Abacavir Sulfate  Hetero Labs Ltd Iii
##  9 Abacavir Sulfate   Tablet  Oral Abacavir Sulfate     Mylan Pharms Inc
## 10 Abacavir Sulfate   Tablet  Oral Abacavir Sulfate       Strides Pharma
## # ... with 64,413 more rows, and 30 more variables: strength <chr>,
## #   drug_type <chr>, appl_no <int>, product_no <int>, tecodestr <chr>,
## #   tecode <int>, approval_month <chr>, approval_day <chr>,
## #   approval_year <chr>, otc <chr>, appfull <chr>, ref_listed_drug <fctr>,
## #   ref_std <fctr>, singnum <int>, singredient <chr>, patent_num <chr>,
## #   pediatric_exclusivity_granted <fctr>, month_pat_expires <chr>,
## #   day_pat_expires <chr>, year_pat_expires <chr>, pat_use_code <chr>,
## #   delist <chr>, drug_substance_flag <fctr>, drug_prod_flag <fctr>,
## #   patent_delisted <fctr>, exclcode <chr>, excldatestr <chr>,
## #   month_excl_expires <chr>, date_excl_expires <chr>,
## #   year_excl_expires <chr>
# final goal: disease, funding, drug prices, and Orange Book

# eliminate unnecessary variables from orange book dataset
orange_book_clean = orange_book %>%
  rename(application_no = appl_no, 
         active_ingredient = singredient, 
         dose_form = doseform) %>%
  select(-c(ingredient, applicant, product_no, tecodestr, tecode, approval_day, ref_std, singnum, day_pat_expires, pat_use_code, drug_substance_flag, drug_prod_flag, exclcode, excldatestr, date_excl_expires)) %>% 
  select(trade_name, everything()) %>% 
  separate(dose_form, into = c("dose_form", "remove")) %>%
  filter(is.na(remove)) %>% 
  select(-remove) %>% 
  filter(str_detect(dose_form, regex("Tablet", ignore_case = TRUE))) %>%
  mutate(trade_name = trimws(trade_name, which = c("both")),
         trade_name = str_replace_all(trade_name, ", ", "-"),
         trade_name = str_replace_all(trade_name, " And ", "-"))
## Warning: Too many values at 8293 locations: 78, 79, 80, 81, 82, 153, 154,
## 155, 156, 157, 158, 159, 634, 635, 636, 1085, 1086, 1087, 1088, 1089, ...
## Warning: Too few values at 51953 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
# inner join of disease drug and orange book by trade_name
final_dataset =
  new_drug_price %>%
  inner_join(., orange_book_clean, by = c("trade_name", "drug_type", "otc")) %>% 
  distinct() %>% 
  left_join(., drug_disease_data, by = "trade_name") %>% 
  distinct() %>% 
  mutate(disease = str_replace(disease, " And ", "_"),
         disease = str_replace(disease, "High Blood Pressure \\(", ""),
         disease = str_replace(disease, "Copd \\(", ""),
         disease = str_replace(disease, "\\/Hyperactivity", ""),
         disease = str_replace(disease, "\\)", ""),
         disease = str_replace(disease, " Mellitus, Type 2", ""),
         disease = str_replace(disease, " Mellitus Types I", ""),
         disease = str_replace(disease, "Hepatitis", "Hepatitis C"),
         disease = str_replace(disease, "Hepatitis C B", "Hepatitis B"),
         disease = str_replace(disease, "Hepatitis C C", "Hepatitis C"),
         disease = str_replace(disease, "Hiv/Aids", "HIV/AIDS"),
         disease = str_replace(disease, " \\- ", "_"),
         disease = str_replace(disease, "Migraine", "Migraines")
         ) %>%
  separate(disease, into = c("disease", "remove"), sep = "\\_") %>% 
  select(-remove) %>% 
  separate(disease, into = c("disease", "remove"), sep = "\\(") %>% 
  select(-remove) %>% 
  separate(disease, into = c("disease", "remove"), sep = "\\;") %>%
  select(-remove) %>% 
  left_join(., final_NIH_funding, by = c("disease", "year")) %>%
  as_tibble() %>%
  distinct() %>% 
  mutate(disease = str_replace(disease, "Chronic Obstructive Pulmonary Disease", "Chronic Obstructive Pulmonary Disorder"),
         disease = str_replace(disease, "Kidney Disease", "Chronic Kidney Disease"),
         disease = str_replace(disease, "Cardiac Ischemia", "Coronary Artery Disease"),
         disease = str_replace(disease, "Diabetes", "Diabetes Mellitus Types 1 & 2"),
         disease = str_replace(disease, "Bipolar Disorder", "Bipolar Mood Disorder"))
## Warning: Column `otc` joining factor and character vector, coercing into
## character vector
## Warning: Too few values at 29966 locations: 1366, 1367, 1368, 1369, 1370,
## 1371, 1372, 1373, 1374, 1375, 1376, 1377, 1458, 1459, 1460, 1461, 1462,
## 1463, 1464, 1465, ...
## Warning: Too few values at 32342 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
## Warning: Too few values at 33027 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...

Josie

library(tidyverse)
library(rvest)
library(httr)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(plotly)
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.4.2
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(forcats)
library(stringr)
library(viridis)
library(ggplot2)
library(dplyr)
library(flexdashboard)
## Warning: package 'flexdashboard' was built under R version 3.4.2
library(readxl)
library(janitor)

Do drugs that treat chronic illnesses (diabetes, heart disease, high blood pressure, etc) receive more or less funding, and pricing compared to acute illnesses?

How does pricing and funding compare across diseases (27 unique diseases in our “orange book”) regardless of disease type?

###chronic diseases

#first dataset
chronic_diseases_1_html = read_html("https://www.medicalschemes.com/medical_schemes_pmb/chronic_disease_list.htm")
chronic_diseases_1 = chronic_diseases_1_html %>%
  html_nodes("td td td .maintext") %>%
  html_text() %>%
  as_tibble() %>%
  rename(disease = value)

#second dataset
chronic_diseases_2_html = read_html("http://www.health24.com/Medical-schemes/PMB-and-chronic-disease/List-of-chronic-diseases-20120721")
chronic_diseases_2 = chronic_diseases_2_html %>%
  html_nodes("ul:nth-child(10) li , ul:nth-child(10) a") %>%
  html_text() %>%
  as_tibble() %>%
  rename(disease = value)

#third dataset
chronic_diseases_3_html = read_html("https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Chronic-Conditions/CC_Main.html")
chronic_diseases_3 = chronic_diseases_3_html %>%
  html_nodes("td") %>%
  html_text() %>%
  as_tibble() %>%
  rename(disease = value)

#final chronic disease dataset
chronic_diseases = rbind(chronic_diseases_1, chronic_diseases_2, chronic_diseases_3) %>%
  distinct()

#recode diseases
chronic_diseases$disease[62] = "Cancer"
chronic_diseases$disease[67] = "High Blood Pressure"
chronic_diseases$disease[43] = "High cholesterol"
chronic_diseases$disease[60] = "Austism"
chronic_diseases$disease[55] = "Arthritis"
chronic_diseases$disease[46] = "Hypothyroidism"
chronic_diseases$disease[38] = "Dysrhythmia"
chronic_diseases$disease[66] = "inactive thyroid gland"
chronic_diseases$disease[53] = "alzheimer's disease"
chronic_diseases$disease[26] = "Dementia"
chronic_diseases$disease[56] = "Hepatitis"
chronic_diseases$disease[71] = "COPD"

#need to include breast cancer, hepatitis b, hepatitis c, lung cancer, prostate cancer bc chronic disease set does not specify
#hepatitis
hepatitis_chronic = final_dataset %>%
  filter(str_detect(disease, regex("hepatitis", ignore_case = TRUE))) %>%
  pull(disease) %>%
  unique() %>%
  as_tibble() %>%
  rename(disease = value)

#cancer
cancer_chronic = final_dataset %>%
  filter(str_detect(disease, regex("cancer", ignore_case = TRUE))) %>%
  pull(disease) %>%
  unique() %>%
  as_tibble() %>%
  rename(disease = value)

#hiv/aids
hiv_aids_chronic = final_dataset %>%
  filter(str_detect(disease, regex("hiv", ignore_case = TRUE)) | str_detect(disease, regex("aids", ignore_case = TRUE))) %>%
  pull(disease) %>%
  unique() %>%
  as_tibble() %>%
  rename(disease = value)

#combine hepatitis, cancer and hiv/aids with chronic diseases
chronic_diseases = rbind(chronic_diseases, hepatitis_chronic, cancer_chronic, hiv_aids_chronic)
chronic_diseases = chronic_diseases %>%
  unique()

#indicator of chronic disease
indicator_chronic_disease = vector()
indicator_chronic_disease[1:nrow(chronic_diseases)] = c("yes")
indicator_chronic_disease = as.tibble(indicator_chronic_disease)

#final chronic disease dataset with indicator column (y/n)
chronic_diseases_final = cbind(chronic_diseases, indicator_chronic_disease) %>%
  rename(chronic_disease = value) 

#formatting for exact matching
final_dataset$disease = trimws(final_dataset$disease, which = "right")
chronic_diseases_final$disease = trimws(chronic_diseases_final$disease, which = "both")
final_dataset$disease = tolower(final_dataset$disease)
chronic_diseases_final$disease = tolower(chronic_diseases_final$disease)

# Join final dataset and chronic disease dataset to complete analysis.
final_dataset = left_join(final_dataset,chronic_diseases_final, by = "disease")
final_dataset$chronic_disease[is.na(final_dataset$chronic_disease)] = "no"
final_dataset = final_dataset %>%
  mutate(disease_type = recode(chronic_disease, 'no' = "acute", 'yes' = "chronic")) #%>%
  #select(everything(), -chronic_disease.x, -chronic_disease.y)

rm(chronic_diseases, indicator_chronic_disease, cancer_chronic, hepatitis_chronic, hiv_aids_chronic, chronic_diseases_1, chronic_diseases_2, chronic_diseases_3, chronic_diseases_1_html, chronic_diseases_2_html, chronic_diseases_3_html, chronic_diseases_final) #clean up workspace

FUNDING

#Let's examine funding based on whether or not a disease is chronic or acute

### EXPLORATORY ANALYSIS
#Summary Statistics for Funding: Chronic Disease vs. Acute Disease (by disease-->we have 27 unique diseases)
summary_funding_individual_diseases = final_dataset %>%
  filter(!is.na(disease)) %>% #exclude orange book entries without a disease entry
  group_by(disease_type, disease) %>% #this line and upward is general for chronic disease analysis
  filter(!is.na(funding)) %>%
  summarize(n = n(), mean = mean(funding), median = median(funding), standard_deviation = sd(funding), minimum = min(funding), maximum = max(funding)) 

#Which disease (top 3) has the highest median funding (by disease type)? 
max_funding_individual_diseases = summary_funding_individual_diseases %>%
  filter(min_rank(desc(median)) < 4)

#Which disease (bottom 3) has the lowest median funding (by disease type)? 
min_funding_individual_diseases = summary_funding_individual_diseases %>%
  filter(min_rank(median) < 4)

#Summary Statistics for Funding: Chronic Disease vs. Acute Disease (by disease type)
summary_funding_disease_type = final_dataset %>%
  filter(!is.na(disease)) %>% #exclude orange book entries without a disease entry
  group_by(disease_type) %>% #this line and upward is general for chronic disease analysis
  filter(!is.na(funding)) %>%
  summarize(n = n(), mean = mean(funding), median = median(funding), standard_deviation = sd(funding), minimum = min(funding), maximum = max(funding)) 

#Summary Statistics for Funding: Disease in General
summary_funding_by_disease = final_dataset %>%
  filter(!is.na(disease)) %>% #exclude orange book entries without a disease entry
  group_by(disease) %>% #this line and upward is general for chronic disease analysis
  filter(!is.na(funding)) %>%
  summarize(n = n(), mean = mean(funding), median = median(funding), standard_deviation = sd(funding), minimum = min(funding), maximum = max(funding)) 

#Which disease (top 3) has the highest median funding? 
max_funding = summary_funding_by_disease %>%
  filter(min_rank(desc(median)) < 4)

#Lowest funding (lowest 3)?
min_funding = summary_funding_by_disease %>%
  filter(min_rank(median) < 4)
#Graphs for Funding (PLOT_LY!)
# 1. Diseases with higest median funding (grouped by disease_type) (top 3) (bar chart) NEED HELP REARRANGING FROM SMALLEST TO LARGEST
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding)) %>%
  # mutate(disease = fct_infreq(disease),
  #        disease_type = fct_infreq(disease_type)) %>%
  group_by(disease_type, disease) %>% 
  summarize(median = median(funding)) %>%
  filter(min_rank(desc(median)) < 4) %>%
  plot_ly(x = ~disease, y = ~median, color = ~disease_type, type = "bar")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
  #possible help?
  # mutate(cuisine_description = fct_reorder(cuisine_description, score)) %>%
  # group_by(cuisine_description, grade) %>%
  # summarize(mean_score = mean(score)) %>%
  # plot_ly(x = ~cuisine_description, y = ~mean_score, color = ~grade, type = "bar")
  
# 2. Diseases with lowest median funding (bottom 3) (bar chart)
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding)) %>%
  group_by(disease_type, disease) %>% 
  summarize(median = median(funding)) %>%
  filter(min_rank(median) < 4) %>%
  plot_ly(x = ~disease, y = ~median, color = ~disease_type, type = "bar", colors = "Set2")
# 1A. Overall: Diseases with highest median funding
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding)) %>%
  # mutate(disease = fct_infreq(disease)) %>%
  group_by(disease) %>% 
  summarize(median = median(funding)) %>%
  filter(min_rank(desc(median)) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set2")
# 2A. Overall: Diseases with lowest median funding
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding)) %>%
  group_by(disease) %>% 
  summarize(median = median(funding)) %>%
  filter(min_rank(median) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set2")
# 1B. Acute Diseases with highest median funding
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding) & disease_type == "acute") %>%
  group_by(disease) %>% 
  summarize(median = median(funding)) %>%
  filter(min_rank(desc(median)) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set2")
# 1C. Chronic Diseases with highest median funding
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding) & disease_type == "chronic") %>%
  group_by(disease) %>% 
  summarize(median = median(funding)) %>%
  filter(min_rank(desc(median)) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set2")
# 2B. Acute Diseases with lowest median funding
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding) & disease_type == "acute") %>%
  group_by(disease) %>% 
  summarize(median = median(funding)) %>%
  filter(min_rank(median) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set1")
# 2C. Chronic Diseases with lowest median funding
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding) & disease_type == "chronic") %>%
  group_by(disease) %>% 
  summarize(median = median(funding)) %>%
  filter(min_rank(median) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set3")
# 3. Funding by disease status (boxplot)
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding)) %>%
  mutate(disease_type = fct_reorder(disease_type, funding)) %>%
  group_by(disease_type) %>% 
  plot_ly(y = ~funding, type = "box", color = ~disease_type, colors = "Set3")
## Warning in mutate_impl(.data, dots): Unequal factor levels: coercing to
## character
## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector
# 4. Funding by disease status & disease (boxplot)

#by most orange book entries (Could try by prevalence)
most_entries_diseases = final_dataset %>%
  group_by(disease_type) %>%
  filter(!is.na(disease) & !is.na(funding)) %>%
  count(disease, sort = TRUE) %>%
  top_n(3) %>%
  select(disease)
## Selecting by n
## Adding missing grouping variables: `disease_type`
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding)) %>%
  inner_join(most_entries_diseases) %>%
  # mutate(disease_type = fct_reorder(disease_type, funding), 
  #        disease = fct_reorder(disease)) %>%
  group_by(disease) %>% 
  plot_ly(y = ~funding, type = "box", color = ~disease, colors = "Set3")
## Joining, by = c("disease", "disease_type")
#by prevalence NOT WORKING!
# most_prevalent_diseases = final_dataset %>%
#   group_by(disease, disease_type) %>%
#   filter(!is.na(disease) & !is.na(funding) & !is.na(funding)) %>%
#   filter(min_rank(desc(prevalence)) < 4) %>%
#   select(disease)
  
# 5. Prevalence vs. Median Funding based on Disease Type. Do more prevalent Chronic or Acute Diseases receive more funding?
#line graph
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding)) %>%
  group_by(disease_type, prevalence) %>% 
  summarize(median_funding = median(funding)) %>%
  plot_ly(x = ~prevalence, y = ~median_funding, type = "scatter", mode = "lines", color = ~disease_type, colors = "Set1")
#scatter plot
final_dataset %>%
  filter(!is.na(disease) & !is.na(funding)) %>%
  group_by(disease_type, disease, prevalence) %>% 
  summarize(median_funding = median(funding)) %>%
  plot_ly(x = ~prevalence, y = ~median_funding, type = "scatter", color = ~disease_type, colors = "Set1")
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: Ignoring 14 observations

PRICING

summary_pricing_individual_diseases = final_dataset %>%
  filter(!is.na(disease)) %>% 
  group_by(disease_type, disease) %>% 
  filter(!is.na(avg_price)) %>%
  summarize(n = n(), mean = mean(avg_price), median = median(avg_price), standard_deviation = sd(avg_price), minimum = min(avg_price), maximum = max(avg_price)) 

#Which disease (top 3) has the highest median pricing (by disease type)? 
max_price_individual_diseases = summary_pricing_individual_diseases %>%
  filter(min_rank(desc(median)) < 4)

#Which disease (bottom 3) has the lowest median pricing (by disease type)? 
min_price_individual_diseases = summary_pricing_individual_diseases %>%
  filter(min_rank(median) < 4)

#Summary Statistics for Pricing: Chronic Disease vs. Acute Disease (by disease type)
summary_pricing_disease_type = final_dataset %>%
  filter(!is.na(disease)) %>% 
  group_by(disease_type) %>% 
  filter(!is.na(avg_price)) %>%
  summarize(n = n(), mean = mean(avg_price), median = median(avg_price), standard_deviation = sd(avg_price), minimum = min(avg_price), maximum = max(avg_price))  

#Summary Statistics for Pricing: Disease in General
summary_pricing_by_disease = final_dataset %>%
  filter(!is.na(disease)) %>% 
  group_by(disease) %>% 
  filter(!is.na(avg_price)) %>%
  summarize(n = n(), mean = mean(avg_price), median = median(avg_price), standard_deviation = sd(avg_price), minimum = min(avg_price), maximum = max(avg_price)) 

#Which disease (top 3) has the highest median pricing? 
max_pricing = summary_pricing_by_disease %>%
  filter(min_rank(desc(median)) < 4)

#Lowest pricing (lowest 3)?
min_pricing = summary_pricing_by_disease %>%
  filter(min_rank(median) < 4)
#Graphs for Pricing (PLOT_LY!)
# 1. Diseases with higest median pricing (grouped by disease_type) (top 3) (bar chart) NEED HELP REARRANGING FROM SMALLEST TO LARGEST
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  group_by(disease_type, disease) %>% 
  summarize(median = median(avg_price)) %>%
  filter(min_rank(desc(median)) < 4) %>%
  plot_ly(x = ~disease, y = ~median, color = ~disease_type, type = "bar")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# 2. Diseases with lowest median pricing (bottom 3) (bar chart)
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  group_by(disease_type, disease) %>% 
  summarize(median = median(avg_price)) %>%
  filter(min_rank(median) < 4) %>%
  plot_ly(x = ~disease, y = ~median, color = ~disease_type, type = "bar")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# 1A. Overall: Diseases with highest median Pricing
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  # mutate(disease = fct_infreq(disease)) %>%
  group_by(disease) %>% 
  summarize(median = median(avg_price)) %>%
  filter(min_rank(desc(median)) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set2")
# 2A. Overall: Diseases with lowest median Pricing
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  group_by(disease) %>% 
  summarize(median = median(avg_price)) %>%
  filter(min_rank(median) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set2")
# 1B. Acute Diseases with highest median Pricing
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price) & disease_type == "acute") %>%
  group_by(disease) %>% 
  summarize(median = median(avg_price)) %>%
  filter(min_rank(desc(median)) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set2")
# 1C. Chronic Diseases with highest median Pricing
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price) & disease_type == "chronic") %>%
  group_by(disease) %>% 
  summarize(median = median(avg_price)) %>%
  filter(min_rank(desc(median)) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set2")
# 2B. Acute Diseases with lowest median Pricing
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price) & disease_type == "acute") %>%
  group_by(disease) %>% 
  summarize(median = median(avg_price)) %>%
  filter(min_rank(median) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set1")
# 2C. Chronic Diseases with lowest median Pricing
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price) & disease_type == "chronic") %>%
  group_by(disease) %>% 
  summarize(median = median(avg_price)) %>%
  filter(min_rank(median) < 6) %>%
  plot_ly(x = ~disease, y = ~median, type = "bar", color = ~disease, colors = "Set3")
# 3. Pricing by disease status (boxplot)
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  mutate(disease_type = fct_reorder(disease_type, avg_price)) %>%
  group_by(disease_type) %>% 
  plot_ly(y = ~avg_price, type = "box", color = ~disease_type, colors = "Set3")
## Warning in mutate_impl(.data, dots): Unequal factor levels: coercing to
## character
## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector

## Warning in mutate_impl(.data, dots): binding character and factor vector,
## coercing into character vector
# 4. Pricing by disease status & disease (boxplot)

#by most orange book entries (Could try by prevalence)
most_entries_diseases = final_dataset %>%
  group_by(disease_type) %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  count(disease, sort = TRUE) %>%
  top_n(3) %>%
  select(disease)
## Selecting by n
## Adding missing grouping variables: `disease_type`
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  inner_join(most_entries_diseases) %>%
  group_by(disease) %>% 
  plot_ly(y = ~avg_price, type = "box", color = ~disease, colors = "Set3")
## Joining, by = c("disease", "disease_type")
# 5. Prevalence vs. Median Pricing based on Disease Type. Do more prevalent Chronic or Acute Diseases have higher average prices?
#line graph
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  group_by(disease_type, prevalence) %>% 
  summarize(median_avg_price = median(avg_price)) %>%
  plot_ly(x = ~prevalence, y = ~median_avg_price, type = "scatter", mode = "lines", color = ~disease_type, colors = "Set1")
#scatter plot. the less prevalent a disease, the higher the median_avg price
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  group_by(disease_type, disease, prevalence) %>% 
  summarize(median_avg_price = median(avg_price)) %>%
  plot_ly(x = ~prevalence, y = ~median_avg_price, type = "scatter", color = ~disease_type, colors = "Set1")
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: Ignoring 97 observations

For 2016, and among top 10 drugs for advertising, is how much money spent on advertising correlated with unit price? Funding?

#read in advertising dataset
top_advertising_cost = read_excel("./data/advertising_cost.xlsx", 
    sheet = "Data", skip = 3) %>%
  clean_names() %>%
  rename(trade_name = leading_pharmaceutical_brands_in_the_united_states_in_2016_by_national_tv_ad_spend_in_million_u_s_dollars, spending_million_dollars = spending_in_million_u_s_dollars)

# pricing
summary_advertising_pricing = final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  inner_join(top_advertising_cost) %>%
  summarize(median_avg_price = median(avg_price))
## Joining, by = "trade_name"
# pricing with disease and disease type (more detailed)
summary_advertising_pricing = final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  inner_join(top_advertising_cost) %>%
  group_by(trade_name, disease, disease_type) %>%
  summarize(median_avg_price = median(avg_price))
## Joining, by = "trade_name"
# boxplot comparing most advertising drugs
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  inner_join(top_advertising_cost) %>%
  group_by(trade_name) %>% 
  plot_ly(y = ~avg_price, type = "box", color = ~trade_name, colors = "Set3")
## Joining, by = "trade_name"
### IS there a way for me to have the disease/disease_type AND have name of drug displayed????
# derivation of the above boxplot (categorical data as the disease they treat)
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  inner_join(top_advertising_cost) %>%
  group_by(trade_name, disease, disease_type) %>%
  plot_ly(y = ~avg_price, type = "box", color = ~disease, colors = "Set3")
## Joining, by = "trade_name"
# by disease type they treat
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  inner_join(top_advertising_cost) %>%
  group_by(trade_name, disease, disease_type) %>%
  plot_ly(y = ~avg_price, type = "box", color = ~disease_type, colors = "Set3")
## Joining, by = "trade_name"
# pricing and advertising 
final_dataset %>%
  filter(!is.na(disease) & !is.na(avg_price)) %>%
  inner_join(top_advertising_cost) %>%
  group_by(trade_name, spending_million_dollars) %>%
  summarize(median_avg_price = median(avg_price)) %>%
  plot_ly(x = ~median_avg_price, y = ~spending_million_dollars, color = ~trade_name, type = "scatter", colors = "Set2")
## Joining, by = "trade_name"
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

What about drug sales?

top_prescribed_drugs_sales = read_excel("./data/top_prescribed-drugs-sales-2016.xlsx",
    sheet = "Data", skip = 3) %>%
  clean_names() %>%
  rename(trade_name = top_20_prescription_drugs_based_on_u_s_sales_in_2016_in_billion_u_s_dollars, sales_billion_dollars = sales_in_billion_u_s_dollars)
# Remomve some unecessary datasets
rm(patent_exclusivity, product_ingredient)

Manali

Next, we looked to explore whether there was a relationship between prevalence and funding. While the number of diseases with the required complete data was relatively small, it appears from the scatterplot that when you have a low prevalence, there is a wide range of the average funding. As the prevalence increases the funding amount generally seems to be on the lower side. However, if you look more closely at which diseases are included, some interesting observations can be made. Many of the diseases with high funding are cancers, which have a low prevalence. This makes sense as there is a lot of research focused on cancer and new drug development. Most cancers have large subgroups with no good treatment options, so it makes sense that a large amount of funding is devoted to cancer. Some diseases with higher prevalence but lower funding are hypertension, migraines, ADD, and COPD, which do already have treatment options available on the market.

Then, we looked to determine whether funding correlated with drug prices. This does not really seem to be the case from the scatterplot. Along the entire range of funding amounts, most drugs still have relatively low prices. It is interesting that different drugs that treat the same disease seem to be largely clustered together. We see breast cancer treatments are unusually high in price, which again makes sense given what we know about cancer treatment.

#do rare diseases recieve less funding?
#set.seed(1)
final_dataset %>%
  #sample_n(50)%>%
  group_by(disease, prevalence) %>%
  #filter(!is.na(disease)) %>%
  #filter(!is.na(prevalence)) %>%
  summarize(avg_funding = mean(funding, na.rm = TRUE)) %>%
  plot_ly(x = ~ prevalence, y = ~ avg_funding, type = "scatter", color = ~disease, text = ~paste('Disease:', disease))
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: Ignoring 99 observations
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
#does funding correlate with drug prices?

final_dataset %>%
  #sample_n(50)%>%
  group_by(disease, funding, avg_price) %>%
  summarize(avg_funding = mean(funding, na.rm = TRUE)) %>%
  #plot_ly(x = ~funding, y = ~avg_price, type = "scatter")
  plot_ly(x = ~ avg_funding, y = ~ avg_price, type = "scatter", color = ~disease, text = ~paste('Disease:', disease))
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: Ignoring 2305 observations

## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

Muhire

We created a spaghetti plot of the average price of drugs per year for each drug. We only used data on prices per drug unit (mainly per tablet). Information about the disease that the drugs treat can be seen by hovering over the plot. We notice that of the drugs with information on price per unit drug, the drugs for Hepatitis C seem to have the highest cost.

Possible case studies on most popular (perceived) drugs: Lipitor (Schizophrenia; not in dataset), Xanax (Panic Disorders, not in dataset), Celebrex, Prozac, Crestor, Nexium, Lisinopril, Zoloft (Panic Disorders; present in dataset. Almost doubled from 2013 to 2017)

final_dataset %>% 
  filter(disease %in% c("Panic Disorders")) %>% 
  filter(pricing_unit == "EA" & !is.na(disease)) %>% 
  plot_ly(x = ~year, y = ~avg_price, type = "scatter", mode = "lines", text = ~paste("Drug: ", trade_name, '<br>Disease:', disease), color = ~trade_name, alpha = 0.5) %>% 
  layout(showlegend = FALSE)

We also created a scatter plot of the mean NIH funding per drug (assumed to be the same as the NIH funding for disease research) against the demand of the drug (defined as the 2015 disease prevalence) colored by disease. Among drugs with available information on price per unit drug, obesity seems to have received the most funding on average and to have the highest 2015 prevalence.

final_dataset %>% 
  filter(pricing_unit == "EA" & !is.na(funding) & !is.na(prevalence)) %>% 
  group_by(trade_name, disease, prevalence) %>% 
  summarize(mean_fund = mean(funding)) %>% 
  plot_ly(x = ~prevalence, y = ~mean_fund, type = "scatter", alpha = 1, text = ~paste("Drug: ", trade_name, '<br>Disease:', disease), color = ~disease) %>%
  layout(showlegend = FALSE)
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

MeOak